import pandas as pd
import numpy as np
import statsmodels.api as sm


# Define the file path to the research project's replication package 
filepath = r'C:\Users\mehran.azimi\\'
# output file name
file_name='IS'

def IS():
    data_IS_1=(data_IS[[depvar,indepvar,'yearmo','monthnum']]).copy()
    data_IS_1=data_IS_1.dropna().reset_index()
    begdate=data_IS_1['yearmo'].iloc[0]
    enddate=data_IS_1['yearmo'].iloc[-1]
    firstmonthnum=data_IS_1[data_IS_1['yearmo']==begdate].index.values[0]
    lastmonth=data_IS_1[data_IS_1['yearmo']==enddate].index.values[0]
    initsamplesize= int((lastmonth - firstmonthnum)/2)+1
    nobservations=lastmonth-firstmonthnum+1
    nforecasts=nobservations - taw
        # subsample coefficients
    dummy=np.array([])        
    for i in range(nobservations-taw):
        if i <initsamplesize:  
            dummy=np.append(dummy,0)
        else:
            dummy=np.append(dummy,1)
    dummy= dummy.reshape(len(dummy),1)
    x=np.array(data_IS_1[indepvar][firstmonthnum:lastmonth+1-taw].astype(float))
    x= x.reshape(len(x),1)
    x=np.hstack((x,dummy, x*dummy))
    X = sm.add_constant(x)
    Y=np.array(data_IS_1[depvar][firstmonthnum+taw:lastmonth+1])
    model_subsamp = sm.OLS(Y, X)
    results_subsamp = model_subsamp.fit( cov_type='HAC',cov_kwds={'maxlags':12})  
    beta_1st=results_subsamp._results.params[1];beta_2nd=beta_1st+ results_subsamp._results.params[3]   
    t_stat_b1b2=results_subsamp._results.tvalues[3]
    x=np.array(data_IS_1[indepvar][firstmonthnum:lastmonth+1-taw].astype(float))
    X = sm.add_constant(x)
    Y=np.array(data_IS_1[depvar][firstmonthnum+taw:lastmonth+1])
    modelA1 = sm.OLS(Y, X)  
    resultsA1NW = modelA1.fit( cov_type='HAC',cov_kwds={'maxlags':12} )
    ehat= Y - ( resultsA1NW._results.params[0] + resultsA1NW._results.params[1]*x )
    ymean=Y.mean()
    R2=resultsA1NW._results.rsquared; R2adj=resultsA1NW._results.rsquared_adj
    beta=resultsA1NW._results.params[1]; t_stat= resultsA1NW._results.tvalues[1]
        # AR(1) for sentiment
    x=np.array(data_IS_1[indepvar][firstmonthnum:lastmonth+1-taw].astype(float))
    X = sm.add_constant(x)
    y=np.array(data_IS_1[indepvar][firstmonthnum+1:lastmonth+1-taw+1].astype(float))
    modelA2 = sm.OLS(y, X)
    resultsA2 = modelA2.fit()
        #to calculate A4 in Huang et al, first calculaye reduced biased estimates
        #calculate reduced bias estimates.
        #following Amihud and Hurvich method and notations
    rohat = resultsA2._results.params[1]
    rohat_corrected = rohat + (1+3*rohat)/len(y) + 3*(1+3*rohat)/len(y) **2
        #Now calculate A4
    phihat= y - ( resultsA2._results.params[0] + rohat_corrected*x )
        #create pseudo sample and run Bootstrap  
    beta_BStrap=np.array([])
    tstat_BStrap=np.array([])
    for b in range(BSsize):
        Wt1=np.random.normal(size=(len(ehat)))
        rtilda=ymean + ehat*Wt1
        xtilda=np.array([x[0]])
        for i in range(1,len(rtilda)):
            xtilda=np.append(xtilda, resultsA2._results.params[0] + rohat_corrected* xtilda[i-1] + phihat[i-1]*Wt1[i-1] )
        modelBStrap=sm.OLS(rtilda,sm.add_constant(xtilda))
        resultsBStrap = modelBStrap.fit(cov_type='HAC',cov_kwds={'maxlags':12})
        beta_BStrap=np.append(beta_BStrap, resultsBStrap._results.params[1])
        tstat_BStrap=np.append(tstat_BStrap,resultsBStrap._results.tvalues[1])
    if beta<0:
        p_v_Bootstrap=len(tstat_BStrap[tstat_BStrap < resultsA1NW._results.tvalues[1]  ]) / len(tstat_BStrap) 
    else:
        p_v_Bootstrap=len(tstat_BStrap[tstat_BStrap > resultsA1NW._results.tvalues[1]  ]) / len(tstat_BStrap)
    if p_v_Bootstrap<0.01:
        sig='***'
    elif p_v_Bootstrap<0.05:
        sig='**'
    elif p_v_Bootstrap<0.1:
        sig='*'
    else:
        sig=''
    beta_sig = str(round(beta,2))+sig
    resultsIS.loc[len(resultsIS.index)]= [depvar,indepvar, taw, beta, t_stat, p_v_Bootstrap ,R2,R2adj , beta_1st, beta_2nd, t_stat_b1b2 , beta_sig ]



# To generate the reuslts in table 11 change "begdate" and "enddate" accordingly
begdate=199002 
enddate=202212  
initsamplesize=198 
BSsize=10000 

data_IS=pd.read_csv(filepath + 'AllData.csv')
data_IS=data_IS[ (data_IS['yearmo']>=begdate) & (data_IS['yearmo']<=enddate)  ]
data_IS=data_IS.reset_index()
 
_vars=['sii','gp', 'alpha', 'ogap_dyn', 'skvw', 'dy', 'dp', 'lty', 'avgcor', 'wtexas', 'bm', 'tbl', 'vrp', 'tail', 'ep', 'svar', 'infl', 'tms', 'eg', 'dfy', 'de', 'ndrbl', 'ntis', 'fbm','lzrt','dtoat','dfr','ltr']
date_filter = data_IS['date'] == '12/31/2022'  
indepvars_2022 = data_IS[  data_IS['date']=='12/31/2022' ][_vars].dropna(axis=1).columns.to_list()
indepvars_2021 = list(set(_vars) - set(indepvars_2022) ) + ['alpha']
  
  #normalize independent variables
# To get results for variables that end in 2021, substitute indepvars_2022 with indepvars_2021 below
for var in indepvars_2022: 
    data_IS[var]=(data_IS[var]-data_IS[var].mean())/data_IS[var].std(ddof=0)      

resultsIS=pd.DataFrame(columns=['depvar','indepvar', 'taw', 'beta', 't_stat', 'p_v_Bootstrap', 'R2','R2adj' , 'beta_1st', 'beta_2nd', 't_stat_b1b2' , 'beta_sig' ])

# Tables 6 and 8
for dep in ['logMktRF_','Sharpe_']: 
    for var in indepvars_2022: 
        for mt in [1,3,6]:
            taw=mt
            indepvar=var
            depvar=dep+str(mt)
            IS()
            print(dep, var, mt)
resultsIS.to_excel(filepath + file_name+'.xlsx', index=False)


